Syllogism Validation with BERT

Python · Deep Learning · NLP

Given two premises, this model classifies whether a valid syllogistic conclusion follows from them, reaching roughly 85% accuracy on a test set with a near 50/50 class balance.

import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import transformers
from transformers import DistilBertTokenizer
from transformers import DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
import numpy as np
df = pd.read_csv('Avicenna_Train.csv', encoding='ISO-8859-1')
df.head()
|   | Premise 1 | Premise 2 | Syllogistic relation | Conclusion |
|---|---|---|---|---|
| 0 | unchecked imbalances in the society, will see ... | correct these imbalances requires in-depth kno... | no | No conclusion |
| 1 | Chronic diseases are heart attacks and stroke,... | In populations that eat a regular high-fiber d... | yes | In populations that eat a regular high-fiber d... |
| 2 | Formative assessment encourages children to en... | An ideal learning environment uses formative a... | yes | An ideal learning environment encourages child... |
| 3 | Underrepresented female labor force in some pr... | Job discrimination comes with underrepresented... | yes | Job discrimination comes with not being able t... |
| 4 | damaged mentality in an individual brings seri... | Aggression harms the mentality of person. | yes | Aggression brings brings serious health proble... |
df['label'] = df['Syllogistic relation'].eq('yes').mul(1)   # 1 = valid syllogistic relation, 0 = no valid conclusion
df['text'] = df['Premise 1'] + " : " + df['Premise 2']      # model input: the two premises joined by " : "
df['label'].value_counts()
1 2427
0 2373
Name: label, dtype: int64
int(len(df) * 0.8)   # size of the training portion of an 80/20 train/validation split
3840
train_texts = df.iloc[:3840]['text'].values
train_labels = df.iloc[:3840]['label'].values

valid_texts = df.iloc[3840:]['text'].values
valid_labels = df.iloc[3840:]['label'].values
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
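The tokenizer returns a batch encoding holding `input_ids` and `attention_mask` entries, one per example. A quick way to sanity-check the joined-premise format is to decode one example back to text (a minimal sketch, assuming the `train_encodings` object from above):

```python
# Inspect one encoded training example (sketch).
print(train_encodings.keys())                                    # dict_keys(['input_ids', 'attention_mask'])
print(tokenizer.decode(train_encodings['input_ids'][0])[:120])   # roughly "[CLS] <premise 1> : <premise 2> [SEP] [PAD] ..."
```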
class SyllogismDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = SyllogismDataset(train_encodings, train_labels)
valid_dataset = SyllogismDataset(valid_encodings, valid_labels)
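Each dataset item is a dict of tensors keyed `input_ids`, `attention_mask` and `labels`, which is exactly the batch format the Hugging Face Trainer expects. A quick check (sketch, assuming the datasets just created):

```python
# Peek at a single training example (sketch).
item = train_dataset[0]
print({k: v.shape for k, v in item.items() if k != 'labels'})   # padded token-id and mask tensors
print(item['labels'])                                           # tensor(0) or tensor(1)
```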
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)   # not used by the Trainer below,
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=True)   # which builds its own dataloaders
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
DEVICE = 'cuda'   # the Trainer handles device placement itself; kept for the manual-loop sketch below
model.train()
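For reference, the Trainer used below takes care of batching, device placement and the optimization loop. Training by hand with the dataloaders defined above would look roughly like this (a minimal sketch, not the run whose logs are reported below; assumes `model`, `DEVICE` and `train_dataloader` from above):

```python
# Hand-written training loop (sketch only; the results below come from the Trainer).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.to(DEVICE)
for epoch in range(3):
    for batch in train_dataloader:
        optimizer.zero_grad()
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)    # passing 'labels' makes the model return a loss
        outputs.loss.backward()
        optimizer.step()
```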
metrics = load_metric('accuracy')
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metrics.compute(predictions=predictions, references=labels)
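Note: in more recent versions of the `datasets` library, `load_metric` is deprecated in favour of the standalone `evaluate` package. If you hit a deprecation error, the drop-in replacement is (assuming `evaluate` is installed):

```python
# Drop-in replacement for datasets.load_metric in newer environments.
import evaluate
metrics = evaluate.load('accuracy')   # compute_metrics above works unchanged
```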
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3, per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16, logging_dir='./logs', logging_steps=72)

trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=valid_dataset,
                  compute_metrics=compute_metrics)
trainer.train()
/usr/local/lib/python3.9/dist-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
***** Running training *****
Num examples = 3840
Num Epochs = 3
Instantaneous batch size per device = 16
Total train batch size (w. parallel, distributed & accumulation) = 16
Gradient Accumulation steps = 1
Total optimization steps = 720
[720/720 01:36, Epoch 3/3]
Step | Training Loss |
---|---|
72 | 0.658500 |
144 | 0.492300 |
216 | 0.413400 |
288 | 0.298300 |
360 | 0.253200 |
432 | 0.216700 |
504 | 0.178600 |
576 | 0.106900 |
648 | 0.106800 |
720 | 0.091800 |
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Training completed. Do not forget to share your model on huggingface.co/models =)
TrainOutput(global_step=720, training_loss=0.281636557314131, metrics={'train_runtime': 97.56, 'train_samples_per_second': 118.081, 'train_steps_per_second': 7.38, 'total_flos': 289110097566720.0, 'train_loss': 0.281636557314131, 'epoch': 3.0})
trainer.evaluate()
***** Running Evaluation *****
Num examples = 960
Batch size = 16
[60/60 02:07]
{'eval_loss': 0.4387502670288086,
'eval_accuracy': 0.88125,
'eval_runtime': 2.2301,
'eval_samples_per_second': 430.476,
'eval_steps_per_second': 26.905,
'epoch': 3.0}
df_test = pd.read_csv('Avicenna_Test.csv', encoding='ISO-8859-1')

df_test['label'] = df_test['Syllogistic relation'].eq('yes').mul(1)
df_test['text'] = df_test['Premise 1'] + " : " + df_test['Premise 2']

test_texts = df_test['text'].values
test_labels = df_test['label'].values

test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

test_dataset = SyllogismDataset(test_encodings, test_labels)

test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=True)
trainer.evaluate(test_dataset)
***** Running Evaluation *****
Num examples = 1200
Batch size = 16
{'eval_loss': 0.5759531855583191,
'eval_accuracy': 0.8525,
'eval_runtime': 2.8515,
'eval_samples_per_second': 420.837,
'eval_steps_per_second': 26.302,
'epoch': 3.0}
sample_text = ['Socrates is a man : all men are mortal']
sample_label = [1]   # expected: a valid syllogistic relation

sample_encoded = tokenizer(sample_text, truncation=True, padding=True)

sample_dataset = SyllogismDataset(sample_encoded, sample_label)
sample_dataset
<__main__.SyllogismDataset at 0x7f63a4fccd60>
np.argmax(trainer.predict(sample_dataset).predictions, axis=-1)   # predicted class (1 = valid relation)
***** Running Prediction *****
Num examples = 1
Batch size = 16
array([1])
sample_text_2 = ['If the streets are wet, it has rained recently : The streets are wet.']
sample_label_2 = [0]   # expected: no valid conclusion (affirming the consequent)

sample_encoded_2 = tokenizer(sample_text_2, truncation=True, padding=True)

sample_dataset_2 = SyllogismDataset(sample_encoded_2, sample_label_2)
np.argmax(trainer.predict(sample_dataset_2).predictions, axis=-1)   # predicted class (0 = no valid relation)
***** Running Prediction *****
Num examples = 1
Batch size = 16
array([0])
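For quick one-off checks, the fine-tuned model can also be called directly on tokenized text without wrapping it in a `SyllogismDataset` or the Trainer (a minimal sketch, assuming the trained `model` and `tokenizer` from above are still in scope):

```python
# Direct inference on a single premise pair (sketch).
model.eval()
inputs = tokenizer('Socrates is a man : all men are mortal', return_tensors='pt').to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
print(F.softmax(logits, dim=-1))    # class probabilities: [no relation, valid relation]
print(int(logits.argmax(dim=-1)))   # 1 = the premises form a valid syllogism
```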